from decimal import Decimal
import keras
from keras.datasets import fashion_mnist
from keras.models import Sequential
from keras.layers import Dense, Dropout
import tensorflow as tf
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import time
start = time.time()
(x_train, y_train), (x_test, y_test) = fashion_mnist.load_data()
x_train = x_train / 255.0
x_test = x_test / 255.0
def plot_acc(history, title="Model Accuracy"):
    """Plot the accuracy per epoch obtained during a training run."""
    plt.plot(history.history['acc'])
    plt.plot(history.history['val_acc'])
    plt.title(title)
    plt.ylabel('Accuracy')
    plt.xlabel('Epoch')
    plt.legend(['Train', 'Val'], loc='upper left')
    plt.show()
def plot_loss(history, title="Model Loss"):
    """Plot the loss per epoch obtained during a training run."""
    plt.plot(history.history['loss'])
    plt.plot(history.history['val_loss'])
    plt.title(title)
    plt.ylabel('Loss')
    plt.xlabel('Epoch')
    plt.legend(['Train', 'Val'], loc='upper right')
    plt.show()
def plot_compare_losses(histories, title="Graph title"):
    """Compare the losses of several training runs; `histories` is a list of (history, name) tuples."""
    legend = []
    colors = ['b', 'g', 'r', 'c', 'm', 'y', 'k']
    linestyle = ['-', '--', '-.', ':']
    index = 0
    for history, name in histories:
        color = colors[index]
        fmt = color + linestyle[0]  # solid line for training loss
        plt.plot(history.history['loss'], fmt)
        fmt = color + linestyle[1]  # dashed line for validation loss
        plt.plot(history.history['val_loss'], fmt)
        legend.append('Train ' + name)
        legend.append('Val ' + name)
        index = index + 1
    plt.title(title)
    plt.ylabel('Loss')
    plt.xlabel('Epoch')
    plt.legend(legend, loc='upper right')
    plt.show()
# https://matplotlib.org/stable/api/_as_gen/matplotlib.pyplot.plot.html
def plot_compare_accs(histories, title="Graph title"):
    """Compare the accuracies of several training runs; `histories` is a list of (history, name) tuples."""
    legend = []
    colors = ['b', 'g', 'r', 'c', 'm', 'y', 'k']
    linestyle = ['-', '--', '-.', ':']
    index = 0
    for history, name in histories:
        color = colors[index]
        fmt = color + linestyle[0]  # solid line for training accuracy
        plt.plot(history.history['acc'], fmt)
        fmt = color + linestyle[1]  # dashed line for validation accuracy
        plt.plot(history.history['val_acc'], fmt)
        legend.append('Train ' + name)
        legend.append('Val ' + name)
        index = index + 1
    plt.title(title)
    plt.ylabel('Accuracy')
    plt.xlabel('Epoch')
    plt.legend(legend, loc='lower right')
    plt.show()
def plot_compare_all(histories, title="Graph title"):
    plot_compare_accs(histories, "Accuracy - " + title)
    plot_compare_losses(histories, "Losses - " + title)
# Note: you can change the numbers here and re-run this line if you want to change the plot size.
matplotlib.rcParams['figure.figsize'] = [8, 8]
def visualize_example(x):
    plt.figure()
    plt.imshow(x)
    plt.colorbar()
    plt.grid(False)
    plt.show()
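As a quick sanity check of the helpers above, a minimal usage sketch (it assumes x_train and y_train are already loaded and scaled as in this notebook):
# Show the first training image and its numeric label (0-9).
visualize_example(x_train[0])
print('Label:', y_train[0])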
In this exercise we will evaluate the importance of using appropriate activation units. Activation functions such as sigmoid have fallen out of use in favor of other units such as ReLU.
Comparison of activation function plots
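The original comparison image is not embedded here, so the following is a minimal sketch that reproduces a side-by-side plot of sigmoid and ReLU (purely illustrative, not part of the experiments below):
# Illustrative plot: sigmoid vs. ReLU over a small input range.
z = np.linspace(-5, 5, 200)
sigmoid = 1.0 / (1.0 + np.exp(-z))
relu = np.maximum(0.0, z)
plt.plot(z, sigmoid, label='sigmoid')
plt.plot(z, relu, label='ReLU')
plt.title('Sigmoid vs ReLU')
plt.xlabel('z')
plt.ylabel('activation')
plt.legend()
plt.show()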
Comparative run: Sigmoid vs ReLU
Definition of methods, using the activity 1 case as a base with the suggested characteristics.
class train_model_callback(tf.keras.callbacks.Callback):
    def on_epoch_end(self, epoch, logs={}):
        '''
        Print the epoch results every 5 epochs (and on the last one); also flag when validation accuracy reaches 90%.
        '''
        acc_achieved = logs.get('val_acc') > 0.90
        if epoch % 5 == 0 or epoch == self.params.get('epochs', -1) - 1:
            threshold_achieved_message = '*Threshold Achieved*' if acc_achieved else ''
            message = (f'Epoch:{epoch} - loss: {logs.get("loss"):.2%} - acc: {logs.get("acc"):.2%}'
                       f' - val_loss: {logs.get("val_loss"):.2%} - val_acc: {logs.get("val_acc"):.2%}'
                       f' {threshold_achieved_message}')
            print(message)
def train_model(x_train, y_train, activation='sigmoid', loss='sparse_categorical_crossentropy', verbose=0, use_early_stop=False):
    print(x_train.shape)
    model = tf.keras.Sequential([
        tf.keras.layers.Flatten(),  # flatten each 28x28 image into a one-dimensional array
        tf.keras.layers.Dense(128, activation=activation, kernel_initializer='glorot_uniform'),
        tf.keras.layers.Dense(128, activation=activation, kernel_initializer='glorot_uniform'),
        tf.keras.layers.Dense(128, activation=activation, kernel_initializer='glorot_uniform'),
        tf.keras.layers.Dense(10, activation='softmax')
    ])
    opt = tf.keras.optimizers.SGD(learning_rate=0.4)
    model.compile(optimizer=opt, loss=loss, metrics=['acc'])
    callbacks = [train_model_callback()]
    if use_early_stop:
        # Keras EarlyStopping; accuracy should be maximized, so mode is 'max'
        early_stopper = tf.keras.callbacks.EarlyStopping(monitor='acc', mode='max', verbose=1, patience=3)
        callbacks.append(early_stopper)
    return model.fit(x_train, y_train, epochs=20, batch_size=64, verbose=verbose, callbacks=callbacks, validation_split=0.2)
Run with the sigmoid activation function
sigmoid_history=train_model(x_train,y_train, activation='sigmoid')
(60000, 28, 28)
Epoch:0 - loss: 122.72% - acc: 52.28% - val_loss: 98.64% - val_acc: 65.37%
Epoch:5 - loss: 40.96% - acc: 85.30% - val_loss: 43.43% - val_acc: 84.27%
Epoch:10 - loss: 34.20% - acc: 87.60% - val_loss: 34.86% - val_acc: 87.45%
Epoch:15 - loss: 30.30% - acc: 88.72% - val_loss: 32.89% - val_acc: 88.04%
Epoch:19 - loss: 28.03% - acc: 89.42% - val_loss: 32.15% - val_acc: 88.41%
Run with the ReLU activation function
relu_history=train_model(x_train,y_train,activation='relu')
(60000, 28, 28)
Epoch:0 - loss: 73.73% - acc: 72.37% - val_loss: 52.83% - val_acc: 80.67%
Epoch:5 - loss: 33.38% - acc: 87.53% - val_loss: 38.14% - val_acc: 86.41%
Epoch:10 - loss: 28.29% - acc: 89.39% - val_loss: 34.26% - val_acc: 88.03%
Epoch:15 - loss: 25.04% - acc: 90.54% - val_loss: 35.45% - val_acc: 88.21%
Epoch:19 - loss: 24.26% - acc: 91.01% - val_loss: 37.07% - val_acc: 87.15%
Run with the Leaky ReLU activation function
leakyrelu_history=train_model(x_train,y_train,tf.keras.layers.LeakyReLU(alpha=0.1))
(60000, 28, 28)
Epoch:0 - loss: 68.12% - acc: 74.68% - val_loss: 45.17% - val_acc: 83.21%
Epoch:5 - loss: 31.74% - acc: 88.26% - val_loss: 35.17% - val_acc: 87.44%
Epoch:10 - loss: 26.46% - acc: 89.87% - val_loss: 32.61% - val_acc: 88.50%
Epoch:15 - loss: 23.34% - acc: 91.18% - val_loss: 32.63% - val_acc: 89.03%
Epoch:19 - loss: 21.07% - acc: 91.93% - val_loss: 36.47% - val_acc: 88.08%
Comparative analysis - Loss: Sigmoid vs ReLU
histories = [(sigmoid_history,'Sigmoid'),(relu_history,'Relu')]
plot_compare_losses(histories, title="Loss: Sigmoid vs Relu")
Comparative analysis - Accuracy: Sigmoid vs ReLU
histories = [(sigmoid_history,'Sigmoid'),(relu_history,'Relu')]
plot_compare_accs(histories, title='Accuracy: Sigmoid vs Relu')
Comparative analysis - Accuracy: ReLU vs LeakyReLU
histories = [(leakyrelu_history,'LRelu'),(relu_history,'Relu')]
plot_compare_accs(histories, 'Accuracy: Relu vs LRelu')
Main differences and brief explanation
We will now evaluate the importance of a correct parameter initialization in a neural network.
Starting from a network similar to the one in the previous exercise (already using ReLUs), comment on the differences observed during training when using different parameter initialization strategies. To do so, initialize all layers with the following strategies, available in Keras, and analyze their differences:
Notes for the reader:
Early stopping is not applied; however, training runs that reach 90% accuracy are flagged with a Threshold Achieved notification.
The results within the analysis show:
We define the base training function
def train_model(x_train, y_train, activation=tf.keras.activations.relu, loss=tf.keras.losses.sparse_categorical_crossentropy,
                verbose=0, validation_split=0.2, epochs=20, batch_size=64,
                kernel_initializer=tf.keras.initializers.glorot_uniform(),
                bias_initializer=tf.keras.initializers.Zeros(),
                optimizer=tf.keras.optimizers.SGD(learning_rate=0.4),
                use_early_stop=False
                ):
    model = tf.keras.Sequential([
        # tf.keras.layers.Flatten(input_shape=(28, 28, 1)),
        tf.keras.layers.Flatten(),  # flatten each 28x28 image into a one-dimensional array
        tf.keras.layers.Dense(128, activation=activation, kernel_initializer=kernel_initializer, bias_initializer=bias_initializer),
        tf.keras.layers.Dense(128, activation=activation, kernel_initializer=kernel_initializer, bias_initializer=bias_initializer),
        tf.keras.layers.Dense(128, activation=activation, kernel_initializer=kernel_initializer, bias_initializer=bias_initializer),
        tf.keras.layers.Dense(128, activation=activation, kernel_initializer=kernel_initializer, bias_initializer=bias_initializer),
        tf.keras.layers.Dense(10, activation=tf.keras.activations.softmax)
    ])
    callbacks = [train_model_callback()]
    if use_early_stop:
        # Keras EarlyStopping; accuracy should be maximized, so mode is 'max'
        early_stopper = tf.keras.callbacks.EarlyStopping(monitor='acc', mode='max', verbose=1, patience=3)
        callbacks.append(early_stopper)
    model.compile(optimizer=optimizer, loss=loss, metrics=['acc'])
    history = model.fit(x_train, y_train, epochs=epochs, batch_size=batch_size,
                        verbose=verbose, callbacks=callbacks,
                        validation_split=validation_split)
    return history
Initialization with zeros (Zeros)
All weights are initialized to zero.
zero_history=train_model(x_train,y_train,kernel_initializer=tf.keras.initializers.Zeros())
Epoch:0 - loss: 230.41% - acc: 10.03% - val_loss: 230.45% - val_acc: 10.05%
Epoch:5 - loss: 230.42% - acc: 9.84% - val_loss: 230.41% - val_acc: 9.83%
Epoch:10 - loss: 230.41% - acc: 9.96% - val_loss: 230.36% - val_acc: 10.03%
Epoch:15 - loss: 230.43% - acc: 9.87% - val_loss: 230.36% - val_acc: 9.57%
Epoch:19 - loss: 230.41% - acc: 9.81% - val_loss: 230.38% - val_acc: 10.03%
Initialization with a normal random variable (Random Normal)
Weights are initialized to random Gaussian values (mean zero and, with the Keras default, a standard deviation of 0.05).
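If you want these parameters to be explicit rather than relying on the defaults, the initializer can be parameterized; a minimal sketch (the values below are simply the Keras defaults spelled out):
# RandomNormal with its parameters written out explicitly (mean 0.0, stddev 0.05 are the defaults).
explicit_normal = tf.keras.initializers.RandomNormal(mean=0.0, stddev=0.05)
# random_normal_history = train_model(x_train, y_train, kernel_initializer=explicit_normal)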
random_normal_history=train_model(x_train,y_train,kernel_initializer=tf.keras.initializers.RandomNormal())
Epoch:0 - loss: 83.47% - acc: 68.08% - val_loss: 48.34% - val_acc: 82.69%
Epoch:5 - loss: 34.28% - acc: 87.29% - val_loss: 35.55% - val_acc: 87.06%
Epoch:10 - loss: 28.77% - acc: 89.17% - val_loss: 35.07% - val_acc: 87.44%
Epoch:15 - loss: 25.26% - acc: 90.48% - val_loss: 36.05% - val_acc: 88.13%
Epoch:19 - loss: 23.50% - acc: 91.09% - val_loss: 41.69% - val_acc: 86.27%
Initialization with the Keras defaults for a Dense layer (glorot uniform strategy)
Weights are drawn from a uniform distribution over [-limit, limit], where limit = sqrt(6 / (fan_in + fan_out)); this strategy is also known as "Xavier uniform".
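A small sketch of the bound used by Glorot/Xavier uniform for the first hidden layer of this network (fan_in = 784 flattened pixels, fan_out = 128 units):
# Glorot/Xavier uniform draws weights from U(-limit, +limit) with limit = sqrt(6 / (fan_in + fan_out)).
fan_in, fan_out = 28 * 28, 128  # first Dense layer of this model
limit = np.sqrt(6.0 / (fan_in + fan_out))
print(f'Glorot uniform limit for {fan_in}->{fan_out}: {limit:.4f}')  # approximately 0.0811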
glorot_uniform_history=train_model(x_train,y_train,kernel_initializer=tf.keras.initializers.glorot_uniform())
Epoch:0 - loss: 84.76% - acc: 68.30% - val_loss: 67.33% - val_acc: 76.28%
Epoch:5 - loss: 34.98% - acc: 87.03% - val_loss: 37.97% - val_acc: 86.39%
Epoch:10 - loss: 29.61% - acc: 88.90% - val_loss: 33.77% - val_acc: 88.05%
Epoch:15 - loss: 26.81% - acc: 89.88% - val_loss: 34.72% - val_acc: 87.85%
Epoch:19 - loss: 24.95% - acc: 90.50% - val_loss: 34.77% - val_acc: 87.78%
Initialization with HeUniform
he_uniform_history=train_model(x_train,y_train,kernel_initializer=tf.keras.initializers.he_uniform())
Epoch:0 - loss: 100.39% - acc: 61.61% - val_loss: 68.80% - val_acc: 71.50%
Epoch:5 - loss: 38.56% - acc: 85.82% - val_loss: 40.82% - val_acc: 85.18%
Epoch:10 - loss: 32.48% - acc: 88.06% - val_loss: 35.41% - val_acc: 87.73%
Epoch:15 - loss: 29.19% - acc: 89.18% - val_loss: 37.84% - val_acc: 86.65%
Epoch:19 - loss: 27.15% - acc: 89.84% - val_loss: 36.08% - val_acc: 87.83%
Initialization with RandomUniform
random_uniform_history=train_model(x_train,y_train,kernel_initializer=tf.keras.initializers.RandomUniform())
Epoch:0 - loss: 104.72% - acc: 58.37% - val_loss: 82.01% - val_acc: 71.14%
Epoch:5 - loss: 35.71% - acc: 86.88% - val_loss: 37.20% - val_acc: 86.69%
Epoch:10 - loss: 30.08% - acc: 88.74% - val_loss: 37.34% - val_acc: 86.75%
Epoch:15 - loss: 26.87% - acc: 89.90% - val_loss: 34.36% - val_acc: 87.94%
Epoch:19 - loss: 24.97% - acc: 90.59% - val_loss: 35.04% - val_acc: 88.09%
Loss and accuracy comparison: Zero vs RandomNormal vs GlorotUniform vs HeUniform vs RandomUniform
histories = [(zero_history,'Zero'),(random_normal_history,'RandomNormal'),(glorot_uniform_history,'GlorotUniform'),(he_uniform_history,'HeUniform'), (random_uniform_history,'RandomUniform')]
plot_compare_all(histories, title='Zero vs RandomNormal vs GlorotUniform vs heUniform vs RandomUniform')
Given the topology of the neural network we are using, zero initialization combined with ReLU makes it very difficult for the network to learn. If we adjust the learning rate and switch the activation to sigmoid, we see improvements with zero initialization, but it still does not come close to the performance of the other initializations.
zero_history=train_model(x_train,y_train,kernel_initializer=tf.keras.initializers.Zeros(),activation='sigmoid', optimizer=tf.keras.optimizers.SGD(learning_rate=0.8))
Epoch:0 - loss: 231.83% - acc: 10.12% - val_loss: 231.02% - val_acc: 9.83%
Epoch:5 - loss: 227.26% - acc: 13.44% - val_loss: 214.58% - val_acc: 19.92%
Epoch:10 - loss: 146.48% - acc: 38.05% - val_loss: 140.96% - val_acc: 42.34%
Epoch:15 - loss: 140.49% - acc: 40.98% - val_loss: 143.59% - val_acc: 33.48%
Epoch:19 - loss: 136.83% - acc: 42.13% - val_loss: 145.69% - val_acc: 32.43%
We generate the visualization again:
histories = [(zero_history,'Zero'),(random_normal_history,'RandomNormal'),(glorot_uniform_history,'GlorotUniform'),(he_uniform_history,'HeUniform'),(random_uniform_history,'RandomUniform')]
plot_compare_all(histories, title='Zero vs RandomNormal vs GlorotUniform vs heUniform vs RandomUniform')
Summary of best accuracy/loss values per run, sorted by training accuracy
data_initialization_accuracy = {
    'Initialization': ['Zero', 'RandomNormal', 'GlorotUniform', 'HeUniform', 'RandomUniform'],
    'Accuracy(Training)': [np.max(zero_history.history['acc']),
                           np.max(random_normal_history.history['acc']),
                           np.max(glorot_uniform_history.history['acc']),
                           np.max(he_uniform_history.history['acc']),
                           np.max(random_uniform_history.history['acc'])],
    'Loss(Training)': [np.min(zero_history.history['loss']),
                       np.min(random_normal_history.history['loss']),
                       np.min(glorot_uniform_history.history['loss']),
                       np.min(he_uniform_history.history['loss']),
                       np.min(random_uniform_history.history['loss'])],
    'Accuracy(Validation)': [np.max(zero_history.history['val_acc']),
                             np.max(random_normal_history.history['val_acc']),
                             np.max(glorot_uniform_history.history['val_acc']),
                             np.max(he_uniform_history.history['val_acc']),
                             np.max(random_uniform_history.history['val_acc'])],
    'Loss(Validation)': [np.min(zero_history.history['val_loss']),
                         np.min(random_normal_history.history['val_loss']),
                         np.min(glorot_uniform_history.history['val_loss']),
                         np.min(he_uniform_history.history['val_loss']),
                         np.min(random_uniform_history.history['val_loss'])],
    'Epoch Count': [np.max(zero_history.epoch),
                    np.max(random_normal_history.epoch),
                    np.max(glorot_uniform_history.epoch),
                    np.max(he_uniform_history.epoch),
                    np.max(random_uniform_history.epoch)]
}
df_data_initialization_accuracy = pd.DataFrame(data_initialization_accuracy)
df_data_initialization_accuracy.sort_values(['Accuracy(Training)', 'Epoch Count'], ascending=False)
| | Initialization | Accuracy(Training) | Loss(Training) | Accuracy(Validation) | Loss(Validation) | Epoch Count |
|---|---|---|---|---|---|---|
| 1 | RandomNormal | 0.910937 | 0.234963 | 0.884667 | 0.328479 | 19 |
| 4 | RandomUniform | 0.905937 | 0.249724 | 0.884000 | 0.333634 | 19 |
| 2 | GlorotUniform | 0.905375 | 0.249524 | 0.881500 | 0.335111 | 19 |
| 3 | HeUniform | 0.898438 | 0.271512 | 0.882667 | 0.341279 | 19 |
| 0 | Zero | 0.421292 | 1.368329 | 0.483000 | 1.311675 | 19 |
Differences / Conclusions
We will continue with Random Normal, the initialization that performed best here, as the default from now on.
The comparison above shows the differences in accuracy, convergence, generalization, etc.
Starting from a network similar to the one in the previous exercise (using the best initialization strategy observed), compare and analyze the differences observed when training with several of the optimizers seen in class, including SGD as the basic optimizer (you may explore each optimizer's hyperparameter space, although for more advanced optimizers such as Adam and RMSprop it is a good idea to keep the default values provided by Keras).
Optimizing with Stochastic Gradient Descent (SGD)
It performs frequent, high-variance weight updates per sample, which makes the objective fluctuate and helps it move past local minima. Momentum can be used to accelerate the correction, and Nesterov is an improved momentum implementation that looks ahead along the slope before updating (see the sketch below; the run itself uses plain SGD).
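A minimal sketch of how momentum and Nesterov could be enabled on the same optimizer (the 0.9 momentum value is just a common choice and is not used in the runs reported here):
# SGD with momentum and Nesterov acceleration (illustrative only; the run below uses plain SGD).
sgd_momentum = tf.keras.optimizers.SGD(learning_rate=0.4, momentum=0.9, nesterov=True)
# sgd_momentum_model = train_model(x_train, y_train, optimizer=sgd_momentum)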
sgd_model=train_model(x_train,y_train,optimizer=tf.keras.optimizers.SGD(learning_rate=0.4))
Epoch:0 - loss: 97.60% - acc: 62.77% - val_loss: 61.71% - val_acc: 74.18%
Epoch:5 - loss: 36.70% - acc: 86.51% - val_loss: 37.03% - val_acc: 87.10%
Epoch:10 - loss: 32.09% - acc: 88.08% - val_loss: 37.23% - val_acc: 86.47%
Epoch:15 - loss: 29.05% - acc: 89.28% - val_loss: 35.26% - val_acc: 87.79%
Epoch:19 - loss: 26.42% - acc: 90.16% - val_loss: 38.15% - val_acc: 87.19%
Optimizing with Adaptive Moment Estimation (Adam)
It is an extension of the stochastic gradient descent (SGD) algorithm that updates the network weights iteratively based on the training data, maintaining a separate adaptive learning rate (alpha) per parameter. It combines the strengths of RMSprop and AdaGrad.
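The run below keeps the Keras defaults; for reference, a sketch with those default hyperparameters written out explicitly:
# Adam with its default hyperparameters spelled out (equivalent to tf.keras.optimizers.Adam()).
adam_explicit = tf.keras.optimizers.Adam(learning_rate=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-07)
# adam_model = train_model(x_train, y_train, optimizer=adam_explicit)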
adam_model=train_model(x_train,y_train,optimizer=tf.keras.optimizers.Adam())
Epoch:0 - loss: 52.92% - acc: 81.11% - val_loss: 41.82% - val_acc: 85.03%
Epoch:5 - loss: 28.27% - acc: 89.40% - val_loss: 32.02% - val_acc: 88.52%
Epoch:10 - loss: 23.35% - acc: 91.19% - val_loss: 33.24% - val_acc: 88.08%
Epoch:15 - loss: 19.66% - acc: 92.53% - val_loss: 34.74% - val_acc: 89.02%
Epoch:19 - loss: 17.75% - acc: 93.16% - val_loss: 39.26% - val_acc: 88.13%
Optimizing with Adamax (Adam based on the infinity norm)
It is a variant of Adam based on the infinity norm.
adamax_model=train_model(x_train,y_train,optimizer=tf.keras.optimizers.Adamax())
Epoch:0 - loss: 59.34% - acc: 79.04% - val_loss: 47.55% - val_acc: 82.75%
Epoch:5 - loss: 30.73% - acc: 88.85% - val_loss: 33.36% - val_acc: 88.09%
Epoch:10 - loss: 25.21% - acc: 90.74% - val_loss: 31.73% - val_acc: 88.67%
Epoch:15 - loss: 21.31% - acc: 92.06% - val_loss: 34.18% - val_acc: 88.48%
Epoch:19 - loss: 19.06% - acc: 92.85% - val_loss: 33.15% - val_acc: 88.92%
Optimizing with Adadelta (adaptive learning rate per dimension)
It removes the need for a manually tuned learning rate; the "delta" refers to the difference between the current weight and the newly updated weight.
adadelta_model=train_model(x_train,y_train,optimizer=tf.keras.optimizers.Adadelta())
Epoch:0 - loss: 225.37% - acc: 14.86% - val_loss: 221.20% - val_acc: 19.47%
Epoch:5 - loss: 169.96% - acc: 53.48% - val_loss: 163.10% - val_acc: 57.66%
Epoch:10 - loss: 121.95% - acc: 64.96% - val_loss: 117.31% - val_acc: 66.09%
Epoch:15 - loss: 98.65% - acc: 66.53% - val_loss: 95.69% - val_acc: 67.62%
Epoch:19 - loss: 89.03% - acc: 68.08% - val_loss: 86.73% - val_acc: 68.92%
Optimizing with Adagrad (parameter-specific learning rates)
Adagrad adapts an individual learning rate for each parameter (feature).
adagrad_model=train_model(x_train,y_train,optimizer=tf.keras.optimizers.Adagrad())
Epoch:0 - loss: 169.92% - acc: 44.99% - val_loss: 105.30% - val_acc: 65.29%
Epoch:5 - loss: 58.30% - acc: 80.21% - val_loss: 56.80% - val_acc: 80.60%
Epoch:10 - loss: 49.68% - acc: 82.73% - val_loss: 49.72% - val_acc: 82.78%
Epoch:15 - loss: 45.87% - acc: 84.08% - val_loss: 46.58% - val_acc: 83.69%
Epoch:19 - loss: 44.00% - acc: 84.72% - val_loss: 44.85% - val_acc: 84.35%
Optimizing with Root Mean Squared Propagation (RMSprop)
An extension of Adagrad that, instead of accumulating all past squared gradients, keeps an exponentially decaying average of them.
rmsdrop_model=train_model(x_train,y_train,optimizer=tf.keras.optimizers.RMSprop())
Epoch:0 - loss: 55.43% - acc: 79.50% - val_loss: 47.33% - val_acc: 82.22%
Epoch:5 - loss: 30.95% - acc: 88.73% - val_loss: 37.30% - val_acc: 87.45%
Epoch:10 - loss: 29.01% - acc: 89.55% - val_loss: 40.72% - val_acc: 87.76%
Epoch:15 - loss: 28.03% - acc: 90.25% - val_loss: 49.47% - val_acc: 88.46%
Epoch:19 - loss: 27.35% - acc: 90.55% - val_loss: 46.75% - val_acc: 89.06%
We generate the visualization:
histories = [(sgd_model,'SGD'),(adam_model,'ADAM'),(adamax_model,'AdamMax'),(adadelta_model,'AdaDelta'),(adagrad_model,'AdaGrad'),(rmsdrop_model,'RMSDrop')]
plot_compare_all(histories, title='SGD vs ADAM vs ADAMAX vs ADADELTA vs ADAGRAD vs RMSDROP')
data_optimizer_accuracy = {
    'Optimizer': ['SGD', 'ADAM', 'ADAMAX', 'ADADELTA', 'ADAGRAD', 'RMSDROP'],
    'Accuracy(Training)': [np.max(sgd_model.history['acc']),
                           np.max(adam_model.history['acc']),
                           np.max(adamax_model.history['acc']),
                           np.max(adadelta_model.history['acc']),
                           np.max(adagrad_model.history['acc']),
                           np.max(rmsdrop_model.history['acc'])],
    'Loss(Training)': [np.min(sgd_model.history['loss']),
                       np.min(adam_model.history['loss']),
                       np.min(adamax_model.history['loss']),
                       np.min(adadelta_model.history['loss']),
                       np.min(adagrad_model.history['loss']),
                       np.min(rmsdrop_model.history['loss'])],
    'Accuracy(Validation)': [np.max(sgd_model.history['val_acc']),
                             np.max(adam_model.history['val_acc']),
                             np.max(adamax_model.history['val_acc']),
                             np.max(adadelta_model.history['val_acc']),
                             np.max(adagrad_model.history['val_acc']),
                             np.max(rmsdrop_model.history['val_acc'])],
    'Loss(Validation)': [np.min(sgd_model.history['val_loss']),
                         np.min(adam_model.history['val_loss']),
                         np.min(adamax_model.history['val_loss']),
                         np.min(adadelta_model.history['val_loss']),
                         np.min(adagrad_model.history['val_loss']),
                         np.min(rmsdrop_model.history['val_loss'])],
    'Epoch Count': [np.max(sgd_model.epoch),
                    np.max(adam_model.epoch),
                    np.max(adamax_model.epoch),
                    np.max(adadelta_model.epoch),
                    np.max(adagrad_model.epoch),
                    np.max(rmsdrop_model.epoch)]
}
df_data_optimizer_accuracy = pd.DataFrame(data_optimizer_accuracy)
df_data_optimizer_accuracy.sort_values(['Accuracy(Training)', 'Epoch Count'], ascending=False)
| | Optimizer | Accuracy(Training) | Loss(Training) | Accuracy(Validation) | Loss(Validation) | Epoch Count |
|---|---|---|---|---|---|---|
| 1 | ADAM | 0.931604 | 0.177453 | 0.890167 | 0.320193 | 19 |
| 2 | ADAMAX | 0.928521 | 0.190614 | 0.891750 | 0.311768 | 19 |
| 5 | RMSDROP | 0.906167 | 0.271632 | 0.890583 | 0.369149 | 19 |
| 0 | SGD | 0.901583 | 0.264224 | 0.881750 | 0.350593 | 19 |
| 4 | ADAGRAD | 0.847208 | 0.440043 | 0.843500 | 0.448466 | 19 |
| 3 | ADADELTA | 0.680792 | 0.890299 | 0.689250 | 0.867338 | 19 |
Differences / Conclusions
We will now train a final network capable of achieving accuracy on the validation set close to 90%. To do so, we combine everything learned so far and use regularization techniques to avoid overfitting. Some of the elements that can be taken into account are listed below.
Two strategies are proposed to reach 90% accuracy:
Strategy and hyperparameters used without convolutional networks
def train_open_model(x_train, y_train, activation=tf.keras.layers.LeakyReLU(alpha=0.1), loss=tf.keras.losses.sparse_categorical_crossentropy,
                     verbose=0, validation_split=0.2, epochs=20, batch_size=64,
                     kernel_initializer=tf.keras.initializers.RandomNormal(),
                     bias_initializer=tf.keras.initializers.Zeros(),
                     optimizer=tf.keras.optimizers.Adam(),
                     use_early_stop=True,
                     kernel_regularizer=None
                     ):
    model = tf.keras.Sequential([
        tf.keras.layers.Flatten(),  # flatten each image into a one-dimensional array
        tf.keras.layers.BatchNormalization(),  # normalize the pixel values
        tf.keras.layers.Dense(256, activation=activation, kernel_initializer=kernel_initializer, bias_initializer=bias_initializer, kernel_regularizer=kernel_regularizer),
        tf.keras.layers.Dense(128, activation=activation, kernel_initializer=kernel_initializer, bias_initializer=bias_initializer, kernel_regularizer=kernel_regularizer),
        tf.keras.layers.Dense(64, activation=activation, kernel_initializer=kernel_initializer, bias_initializer=bias_initializer, kernel_regularizer=kernel_regularizer),
        tf.keras.layers.Dense(10, activation=tf.keras.activations.softmax)
    ])
    model.compile(optimizer=optimizer, loss=loss, metrics=['acc'])
    callbacks = []
    if use_early_stop:
        # Keras EarlyStopping on validation loss
        early_stopper = tf.keras.callbacks.EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=5)
        callbacks.append(early_stopper)
    history = model.fit(x_train, y_train, epochs=epochs, batch_size=batch_size,
                        verbose=verbose, callbacks=callbacks,
                        validation_split=validation_split)
    return history
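Dropout is imported at the top of the notebook but not used in either strategy; a hedged sketch of where Dropout layers could be placed in the dense model above (the 0.3 rate is an example value, not one of the runs reported below):
# Sketch: same dense architecture with Dropout after each hidden layer (rate 0.3 is illustrative, not tuned).
dropout_model = tf.keras.Sequential([
    tf.keras.layers.Flatten(),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.Dense(256, activation=tf.keras.layers.LeakyReLU(alpha=0.1)),
    tf.keras.layers.Dropout(0.3),
    tf.keras.layers.Dense(128, activation=tf.keras.layers.LeakyReLU(alpha=0.1)),
    tf.keras.layers.Dropout(0.3),
    tf.keras.layers.Dense(64, activation=tf.keras.layers.LeakyReLU(alpha=0.1)),
    tf.keras.layers.Dropout(0.3),
    tf.keras.layers.Dense(10, activation=tf.keras.activations.softmax)
])
# dropout_model.compile(optimizer=tf.keras.optimizers.Adam(), loss='sparse_categorical_crossentropy', metrics=['acc'])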
Configuration and hyperparameters used with convolutional networks
def train_convolutional_model(x_train, y_train, activation=tf.keras.activations.relu, loss=tf.keras.losses.sparse_categorical_crossentropy,
                              verbose=0, validation_split=0.2, epochs=20, batch_size=64,
                              kernel_initializer=tf.keras.initializers.RandomNormal(),
                              bias_initializer=tf.keras.initializers.Zeros(),
                              optimizer=tf.keras.optimizers.Adam(),
                              use_early_stop=True,
                              kernel_regularizer=None
                              ):
    x_train = x_train.reshape((x_train.shape[0], 28, 28, 1))  # add the grayscale channel dimension
    model = tf.keras.models.Sequential([
        tf.keras.layers.Conv2D(64, (3, 3), activation=activation, input_shape=(28, 28, 1)),
        tf.keras.layers.MaxPooling2D(2, 2),
        tf.keras.layers.Conv2D(64, (3, 3), activation=activation),
        tf.keras.layers.MaxPooling2D(2, 2),
        tf.keras.layers.Flatten(),
        tf.keras.layers.Dense(128, activation=activation),
        tf.keras.layers.Dense(10, activation=tf.keras.activations.softmax)
    ])
    model.compile(optimizer=optimizer, loss=loss, metrics=['acc'])
    callbacks = []
    if use_early_stop:
        # Keras EarlyStopping on validation loss
        early_stopper = tf.keras.callbacks.EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=5)
        callbacks.append(early_stopper)
    history = model.fit(x_train, y_train, epochs=epochs, batch_size=batch_size,
                        verbose=verbose, callbacks=callbacks,
                        validation_split=validation_split)
    return history
convolutional_history=train_convolutional_model(x_train,y_train, verbose=False,use_early_stop=True)
Epoch 00014: early stopping
Early stopping and regularization (L1, L2, L1|L2)
Training without early stopping
no_early_stop_history=train_open_model(x_train,y_train,use_early_stop=False)
Training with early stopping
early_stop_history=train_open_model(x_train,y_train)
Epoch 00010: early stopping
Training with L1 regularization (with early stopping)
L1_history=train_open_model(x_train,y_train, kernel_regularizer=tf.keras.regularizers.l1())
Training with L2 regularization (with early stopping)
L2_history=train_open_model(x_train,y_train, kernel_regularizer=tf.keras.regularizers.l2())
Training with L1/L2 regularization (with early stopping)
L1L2_history=train_open_model(x_train,y_train, kernel_regularizer=tf.keras.regularizers.l1_l2())
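Note that l1(), l2() and l1_l2() above are called with the Keras default factor of 0.01, which is a fairly strong penalty for a network of this size and helps explain the lower accuracies in the table below; a smaller factor could be passed explicitly (the 1e-4 below is an example value, not one of the reported runs):
# Sketch: a weaker, explicit L2 penalty (1e-4 is illustrative; the Keras default is 0.01).
small_l2 = tf.keras.regularizers.l2(1e-4)
# L2_small_history = train_open_model(x_train, y_train, kernel_regularizer=small_l2)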
We generate the visualization and compare the results
histories = [(early_stop_history,'Early Stop'),(no_early_stop_history,'No Early Stop'),(L1_history,'L1'),(L2_history,'L2'),(L1L2_history,'L1L2'),(convolutional_history,'Convolutional')]
plot_compare_all(histories, title='Early Stop')
data_open_accuracy = {
    'Hyperparameter': ['Early Stop', 'No Early Stop', 'L1(Early Stop)', 'L2(Early Stop)', 'L1|L2(Early Stop)', 'Convolutional'],
    'Accuracy(Training)': [np.max(early_stop_history.history['acc']),
                           np.max(no_early_stop_history.history['acc']),
                           np.max(L1_history.history['acc']),
                           np.max(L2_history.history['acc']),
                           np.max(L1L2_history.history['acc']),
                           np.max(convolutional_history.history['acc'])],
    'Loss(Training)': [np.min(early_stop_history.history['loss']),
                       np.min(no_early_stop_history.history['loss']),
                       np.min(L1_history.history['loss']),
                       np.min(L2_history.history['loss']),
                       np.min(L1L2_history.history['loss']),
                       np.min(convolutional_history.history['loss'])],
    'Accuracy(Validation)': [np.max(early_stop_history.history['val_acc']),
                             np.max(no_early_stop_history.history['val_acc']),
                             np.max(L1_history.history['val_acc']),
                             np.max(L2_history.history['val_acc']),
                             np.max(L1L2_history.history['val_acc']),
                             np.max(convolutional_history.history['val_acc'])],
    'Loss(Validation)': [np.min(early_stop_history.history['val_loss']),
                         np.min(no_early_stop_history.history['val_loss']),
                         np.min(L1_history.history['val_loss']),
                         np.min(L2_history.history['val_loss']),
                         np.min(L1L2_history.history['val_loss']),
                         np.min(convolutional_history.history['val_loss'])],
    'Epoch Count': [np.max(early_stop_history.epoch),
                    np.max(no_early_stop_history.epoch),
                    np.max(L1_history.epoch),
                    np.max(L2_history.epoch),
                    np.max(L1L2_history.epoch),
                    np.max(convolutional_history.epoch)]
}
df_data_open_accuracy = pd.DataFrame(data_open_accuracy)
df_data_open_accuracy.sort_values(['Accuracy(Training)', 'Epoch Count'], ascending=False)
| | Hyperparameter | Accuracy(Training) | Loss(Training) | Accuracy(Validation) | Loss(Validation) | Epoch Count |
|---|---|---|---|---|---|---|
| 5 | Convolutional | 0.966958 | 0.087524 | 0.915750 | 0.246085 | 13 |
| 1 | No Early Stop | 0.953958 | 0.121141 | 0.894333 | 0.317439 | 19 |
| 0 | Early Stop | 0.929250 | 0.184695 | 0.894583 | 0.311035 | 9 |
| 3 | L2(Early Stop) | 0.856875 | 0.503115 | 0.864583 | 0.482542 | 19 |
| 2 | L1(Early Stop) | 0.800917 | 1.047516 | 0.810583 | 1.019207 | 19 |
| 4 | L1\|L2(Early Stop) | 0.793792 | 1.082609 | 0.807333 | 1.048476 | 19 |
We observe that using convolutional networks improves accuracy/loss with fewer epochs; we visualize its structure and results:
tf.keras.utils.plot_model(convolutional_history.model, show_shapes=True)
Failed to import pydot. You must install pydot and graphviz for `pydotprint` to work.
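Since pydot and graphviz are not installed in this environment, a plain-text summary of the architecture can be printed instead (a minimal alternative to the graph):
# Text summary of the convolutional model's layers and parameter counts.
convolutional_history.model.summary()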
histories = [(convolutional_history,'Model')]
plot_compare_all(histories, title='Model')
Once we have chosen what we believe is our best model, based on the estimate obtained on the validation data, it is time to use the test data to see how the model behaves on new data. If we have done things right, this number should be close to the estimate seen on the validation data.
Result without convolutional networks, using the "Early Stop" model to obtain its test accuracy and test loss
test_loss, test_acc = early_stop_history.model.evaluate(x_test, y_test)
print(f'Test Accuracy : {test_acc} - Test Loss: {test_loss}')
10000/10000 [==============================] - 0s 39us/sample - loss: 0.3448 - acc: 0.8886
Test Accuracy : 0.8885999917984009 - Test Loss: 0.344839344727993
Result with convolutional networks
x_test = x_test.reshape((x_test.shape[0], 28, 28, 1)) #GreyScale
conv_test_loss, conv_test_acc = convolutional_history.model.evaluate(x_test, y_test)
print(f'Test Accuracy : {conv_test_acc} - Test Loss: {conv_test_loss}')
10000/10000 [==============================] - 1s 137us/sample - loss: 0.3236 - acc: 0.9048
Test Accuracy : 0.9047999978065491 - Test Loss: 0.32356072134375574
Tests
We take the predictions and the probabilities:
evaluation_model = convolutional_history.model
test_predictions = evaluation_model.predict(x_test)  # per-class softmax probabilities
test_probabilities = np.argmax(evaluation_model.predict(x_test), axis=-1)  # note: argmax returns predicted class indices, not probabilities
A list with the class names is created. Since the last layer is a softmax, we obtain the probability of each class and take the largest one to determine the predicted class.
fashion_class_names = ['T-shirt/top', 'Trouser', 'Pullover', 'Dress', 'Coat',
'Sandal', 'Shirt', 'Sneaker', 'Bag', 'Ankle boot']
model_class_predictions = np.argmax(test_predictions, axis = 1)
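To see the per-class probabilities behind one of these predictions, a short sketch (sample index 50 is simply the first of the examples shown below):
# Inspect the softmax probabilities behind a single prediction.
sample_probs = test_predictions[50]  # the 10 softmax outputs for test sample 50
for class_name, prob in zip(fashion_class_names, sample_probs):
    print(f'{class_name:12s}: {prob:.4f}')
print('Predicted class:', fashion_class_names[np.argmax(sample_probs)])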
We visualize and compare samples
samples = [50, 100, 200, 5000, 6000]
for sample_index in samples:
    print(f'Prediction for sample index ({sample_index}) = {model_class_predictions[sample_index]}({fashion_class_names[model_class_predictions[sample_index]]})')
    visualize_example(x_test[sample_index])
Prediction for sample index (50) = 4(Coat)
Prediction for sample index (100) = 3(Dress)
Prediction for sample index (200) = 1(Trouser)
Prediction for sample index (5000) = 2(Pullover)
Prediction for sample index (6000) = 1(Trouser)
end = time.time()
print((end - start)/60)  # total notebook runtime in minutes
13.78415114879608